In [2]:
#================================================#
# Machine Learning Lecture 3 in Python
# Author: Chong Ma
# Date  : June 26, 2017 
#================================================#
In [3]:
#================================================#
# import Python library (just like library in R)
# that will be used in this lecture 
#================================================#
# update jupyter notebook: pip install -U jupyter
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix  # pandas.tools.plotting was removed; use pandas.plotting
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
In [4]:
# one way to load iris data set
from sklearn import datasets
iris=datasets.load_iris()
iris_data=pd.DataFrame(iris.data)
iris_class=pd.DataFrame(np.repeat(iris.target_names,[50,50,50]))
dataset=pd.concat([iris_data,iris_class],axis=1)
dataset.columns=["Sepal.Length","Sepal.Width","Petal.Length","Petal.Width","Species"]

# Another way to load the iris data set (assumes a local iris.csv file)
# dataset = pd.read_csv("iris.csv", index_col=False)
In [5]:
# print the first and last 5 subjects
dataset.head(5), dataset.tail(5)
Out[5]:
(   Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
 0           5.1          3.5           1.4          0.2  setosa
 1           4.9          3.0           1.4          0.2  setosa
 2           4.7          3.2           1.3          0.2  setosa
 3           4.6          3.1           1.5          0.2  setosa
 4           5.0          3.6           1.4          0.2  setosa,
      Sepal.Length  Sepal.Width  Petal.Length  Petal.Width    Species
 145           6.7          3.0           5.2          2.3  virginica
 146           6.3          2.5           5.0          1.9  virginica
 147           6.5          3.0           5.2          2.0  virginica
 148           6.2          3.4           5.4          2.3  virginica
 149           5.9          3.0           5.1          1.8  virginica)
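Before plotting, a quick sanity check of the frame built above is worthwhile. A minimal sketch using only standard pandas calls (the expected values in the comments follow from the construction above: 150 rows, 50 per species):

# quick sanity checks: dimensions, per-feature summaries, class balance
print(dataset.shape)                      # (150, 5)
print(dataset.describe())                 # mean/std/quartiles of the four features
print(dataset['Species'].value_counts()) # 50 rows per species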
In [6]:
dataset.plot(kind='box', subplots=True, layout=(2,2), 
             figsize=(8,8),
             sharex=False, sharey=False)
plt.show()
In [7]:
# dataset.hist creates its own figure, so a separate plt.figure() call is unnecessary
# (the extra call is what produced the stray empty-figure output)
dataset.hist(figsize=(8,8))
plt.show()
In [8]:
# scatter plot matrix
# scatter_matrix also draws on its own grid of axes, so no plt.figure() is needed
scatter_matrix(dataset, figsize=(10,10))
plt.show()
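The scatter matrix is easier to read when points are colored by class. pandas' scatter_matrix forwards extra keyword arguments to matplotlib's scatter, so one sketch (the palette here is an arbitrary choice) is:

# color each point by its species; c= is passed through to plt.scatter
palette = {'setosa': 'red', 'versicolor': 'green', 'virginica': 'blue'}
scatter_matrix(dataset, figsize=(10,10),
               c=dataset['Species'].map(palette), diagonal='hist')
plt.show()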
In [9]:
# Split-out validation dataset
array = dataset.values
X = array[:,0:4]
Y = array[:,4]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, 
                                                                                random_state=seed)
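Because the hold-out set has only 30 rows, its class balance is left to chance. train_test_split also accepts a stratify argument that preserves the 50/50/50 class proportions; a sketch with hypothetical _s-suffixed names, so the split actually used below is untouched:

# stratified variant: each piece keeps equal numbers of the three species
# (illustrative only; the results below come from the unstratified split above)
X_train_s, X_val_s, Y_train_s, Y_val_s = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed, stratify=Y)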
In [10]:
# Test options and evaluation metric
seed = 7
scoring = 'accuracy'
In [11]:
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
	# random_state only takes effect with shuffle=True, and recent scikit-learn
	# raises an error if it is set while shuffle=False, so it is omitted here
	kfold = model_selection.KFold(n_splits=10)
	cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)
LR: 0.966667 (0.040825)
LDA: 0.975000 (0.038188)
KNN: 0.983333 (0.033333)
CART: 0.966667 (0.040825)
NB: 0.975000 (0.053359)
SVM: 0.991667 (0.025000)
In [12]:
# reproduce the (mean, std) pairs printed above using pandas
pd.DataFrame(results).apply(lambda x: (np.mean(x), np.sqrt(np.var(x))), axis=1)
Out[12]:
0    (0.966666666667, 0.0408248290464)
1             (0.975, 0.0381881307913)
2    (0.983333333333, 0.0333333333333)
3    (0.966666666667, 0.0408248290464)
4             (0.975, 0.0533593686453)
5              (0.991666666667, 0.025)
dtype: object
In [13]:
# Compare Algorithms
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

For a general hypothesis test of $\mathrm{H}_0: \text{null}$ versus $\mathrm{H}_1: \text{non-null}$, where "Rejection" denotes rejecting $\mathrm{H}_0$,

$\mathrm{Precision}=P(\mathrm{H}_1| \text{Rejection})=\frac{\text{True Positive}}{\text{True Positive}+\text{False Positive}}$

$\mathrm{Recall}=P(\text{Rejection}|\mathrm{H}_1)=\frac{\text{True Positive}}{\text{True Positive}+\text{False Negative}}$

$\mathrm{F}_{1}\text{-score}=2\,\frac{\mathrm{Precision}\cdot\mathrm{Recall}}{\mathrm{Precision}+\mathrm{Recall}}$
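
As a concrete check, take the versicolor class from the KNN confusion matrix computed below (TP = 11, FP = 2, FN = 1):

$\mathrm{Precision}=\frac{11}{13}\approx 0.85$, $\quad\mathrm{Recall}=\frac{11}{12}\approx 0.92$, $\quad\mathrm{F}_1=2\,\frac{0.846\cdot 0.917}{0.846+0.917}\approx 0.88$,

which matches the versicolor row of the classification report in the next cell.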

In [14]:
# Make predictions on validation dataset
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
0.9
[[ 7  0  0]
 [ 0 11  1]
 [ 0  2  9]]
             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00         7
 versicolor       0.85      0.92      0.88        12
  virginica       0.90      0.82      0.86        11

avg / total       0.90      0.90      0.90        30
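The per-class numbers in the report can be recovered directly from the confusion matrix: each diagonal entry divided by its column sum gives precision, and divided by its row sum gives recall. A minimal sketch, reusing the objects defined in the cell above:

# recover precision, recall, and F1 per class from the confusion matrix;
# rows of cm are true classes, columns are predicted classes
cm = confusion_matrix(Y_validation, predictions).astype(float)
precision = np.diag(cm) / cm.sum(axis=0)   # TP / (TP + FP)
recall    = np.diag(cm) / cm.sum(axis=1)   # TP / (TP + FN)
f1 = 2 * precision * recall / (precision + recall)
for cls, p, r, f in zip(np.unique(Y_validation), precision, recall, f1):
    print("%-10s  precision %.2f  recall %.2f  f1 %.2f" % (cls, p, r, f))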